---
title: "Predicting Nephrotoxic Acute Kidney Injury"
author: "Chuan Zhou, Karyn E. Yonekawa and Davene R. Wright"
date: '`r format(Sys.time(), "%d %B, %Y")`'
output: 
  word_document: null
pdf_document: null
html_document: default
highlight: "tango"
toc: yes
toc_depth: 2
editor_options: 
  chunk_output_type: console
---
  
## Introduction
  
In this analysis we try to estimate the associations between specific combinations of nephrotoxins that were ordered for patients and AKI risk. The estimated associations will then inform us about how to develop an alert system that can accurately prompt providers about potential AKI risk (good sensitivity), yet without incurring too many false alarms so that it will trigger alarm fatigue (good specificity).

There are more than 40 nephrotoxins that are potentially prescribed. However, they don't have the same chance of be ordered, either alone or in combination. This is one key motivation for our choice of sparse combination modeling approach. If we use logistic regression, not only we implicitly assume all nephrotoxins have the same chance of being ordered, we may also end up with combinations that won't actually occur in practice. 

In addition, we want to identify solo or combinations of nephrotoxins that are at high risk for AKI and should be avoid, and those at low risk for AKI. Such information would be more informative to providers for their decision making.

```{r setup, include=FALSE}
## set-up
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
require("knitr")
#opts_knit$set(root.dir = "O:/AKI")
```

```{r}  
## clean up space
rm(list=ls())

## load packages
library(tidyverse)
library(dplyr)    # for filter()
library(stringr)  # for str_detect
library(caret)   # CART 
library(gmodels) # CrossTable
library(ggplot2) # ggplot
library(plotly)  # interactive scatter plot
library(ROCR)
library(pROC)    # roc
#library(cvTools)
library(MASS)   # prop.test
library(rpart)
library(partykit)
library(glmnet) # Lasso for logistic
library(prodlim) # row.match
```

In this analysis we will only concern ourselves with AKI risk and nephrotoxins. Performance of current alert system or associations between encounter characteristics and AKI are not the focus of the current study but will be explored in future sutdy.

## Pre-NINJA vs. Post-NINJA

On 2014-11-20 Seattle Children's switch from old system to NINJA. The list of medications was expanded to include more nephrotoxins. Alert was triggered if 3 or more nephrotoxins were ordered. The pre-NINJA alert system had sensitivity 46.9%, compared with post-NINJA 43.3% (p=0.22). The pre-NINJA specificity was 73.6% vs. 89.3% during post-NINJA period (p<.001).


```{r}
## set working directory
setwd("O:/Work/Childrens/Rita/QIscholars/AKI")


## read in most current data: 4 years!
## dat.0a contains encounter ID, AKI event indicator (measured by creatinine), and nephrotoxins
dat.0a = read.csv("NTMx_AKI_workingdata.csv", header=T)

dim(dat.0a)
vnames = names(dat.0a)              # 59 nephrotoxins (including mitomycin), 44 in the 13-15 analysis
colsum = apply(dat.0a[,-1], 2, sum) # count how many times a nephrotoxin was ordered
vlist = vnames[-1][colsum>=0]       # keep columns with only 0's
vnames[-1][colsum==0]               # 4 drugs were never prescribed during study period
length(vlist[-1])                   # 55=59-4 meds ever prescribed, 1st column is AKI
```

## Develope our new algorithm

### Data splicing

Since we are not concerned with alerts, we will use both pre- and post-NINJA data to develop our new algorithm. We randomly splitted the data into 75% training dataset and 25% validation sample.

```{r}
###
### prepare training and testing datasets
###
set.seed(927401)

# cleaned dataset, extract only AKI and nephortoxic meds data
fm = formula(paste(vlist[1],"~", paste(vlist[-c(1)], collapse="+"), sep=""))
dat = dat.0a[, names(dat.0a)%in%vlist] 

dat$AKI = as.factor(dat$AKI)              # make sure AKI is a factor
#levels(dat$AKI) = list("Pos"=c("1"), "Neg"=c("0"))
inTrain = createDataPartition(y=dat$AKI,   ## the outcome data are needed
                              p=0.75,      ## percentage of data in training data
                              list=FALSE   ## the format of the results
                             )
training = dat[inTrain,]
testing = dat[-inTrain,]
```

### Logistic regression approach

```{r}
##############################################################
## Descriptive statistics
##############################################################

CrossTable(dat$AKI) # AKI prevalence = 3.4%
n.meds = apply(dat[,-1], 1, sum) # row sums  
CrossTable(n.meds)  # at most 15 meds were ordered per encounter!!!

# number of meds is significantly associated with risk of AKI
tab = CrossTable(n.meds, dat$AKI, chisq = TRUE)
theta.nmeds = tab$prop.row[,"1"]
df = data.frame(AKI=theta.nmeds, n.meds=c(min(n.meds): max(n.meds)))

## logistic regression of AKI on number of meds
fit = glm(AKI~n.meds, data=dat, family=binomial)
exp(coef(fit)[2])

pdf("AKI_vs_Nmeds_2013to2017.pdf")
ggplot(df, aes(n.meds, AKI)) + 
  geom_bar(stat="identity") +
  geom_point(colour="red") +
  geom_path() +
  ggtitle("OR=1.70")
dev.off()

## P(AKI | each med=1)
meds = vlist[-1]
p.med = apply(dat[,-1], 2, mean)*100 # how often the med is used
p.AKI=rep(0, length(meds))           # how often AKI occur if a med is used (regardless alone or in combination)  
for (i in 1:length(meds)) {
  y = dat$AKI[dat[,1+i]==1] # factor
  tab = table(y)
  p.AKI[i] = 100*tab[2]/sum(tab)
}
df=data.frame(meds, p.AKI, p.med)

# AMPHOTERICIN_B turned out to be non-informative because it is always given with other meds
pdf("AKI_vs_meds_2013to2017.pdf")
ggplot(df, aes(meds, p.AKI, size=p.med))+
  geom_point(colour="red") +
  geom_hline(yintercept=25) +
  #geom_bar(stat="identity") +
  coord_flip() 
  #ylab("Prevalence of AKI when a med is given (alone or in combination with others)") +
  #xlab("Medications")
dev.off()

# AMPHOTERICIN_B is always given in combination with other meds
subset(dat, AMPHOTERICIN_B==1)
n.meds[dat$AMPHOTERICIN_B==1]

##
## logistic regression of AKI on meds, while conditioning on number of meds
## Using training dataset
##
dd = training[n.meds[inTrain]==1,]
colsum = apply(dd[,-1], 2, sum)
vlist.0 = names(colsum)[colsum>0]
vlist.0
nv = length(vlist.0) 

# joint model with vancomycin as reference (intercept)
f = as.formula(paste("AKI ~", paste(vlist.0[-nv], collapse="+")))
fit = glm(f, data=dd, family=binomial(link=log))  # log-binomial model to get actual risk (not odds!)
rr = exp(coef(fit))
rr[1] # this is the risk of AKI when Vancomycin is the only med given
CrossTable(dd$AKI, dd$VANCOMYCIN, chisq=TRUE) # compare with rr[1]
p_aki_med1 = c(rr[1]*rr[-1], rr[1])*100
names(p_aki_med1) = vlist.0
round(p_aki_med1, 4)
```

### Sparse modeling

We apply a Naive Bayes approach for identifying AKI risk from drug combos.

For the 59 medications, a total of 1737 (it's still sparse compared to all 2^59 combinations!) meds combinations were ordered during the 2-year study period. Their prevalence and corresponding AKI risk are estimated by proportions. 

```{r}
##################################################################################################
## This is the main code chunk that generates the results!!!
##################################################################################################
##
## when k (k=1, ..., 11) meds are prescribed
##
n.meds = apply(dat[,-1], 1, sum)       # row sums  
summary(n.meds)                        # max=15

indx = n.x = v.x = NULL                # place holders
dd.combo = NULL
for (k in c(min(n.meds):max(n.meds))) {
  dd = training[n.meds[inTrain]==k,]  # k meds were ordered, using training data
  colsum = apply(dd[,-1], 2, sum)     # sums of each column to see which meds were not used
  vlist.1 = names(colsum)[colsum>0]   # subset of meds responsible for the k meds
  vlist.1
  nv = length(vlist.1)          
  
  dd.1=unique(dd[,names(dd)%in%vlist.1]) # unique meds combinations
  n.combo = nrow(dd.1)                   # number of unique meds combinations
  n.combo                      
  
  indx=c(indx, as.numeric(row.names(dd.1))) # indices of unique combinations/states
  meds.combo = vector("list", dim(dd.1)[1]) # initialize a list
  meds = rep(NA, n.combo)
  p.meds.n = p.AKI = n.pts = n.AKI = rep(0, n.combo)
  
  for (i in 1:n.combo) {
    meds.combo[[i]] = vlist.1[dd.1[i,]>0]  # for each row of dd.1
    meds[i] = paste(meds.combo[[i]], collapse=" + ")
    dd.2 = dd[, names(dd)%in%c("AKI", meds.combo[[i]])] # to get dd.3
    if (k==1) { # tricky!
        dd.3 = dd.2[dd.2[,2]==1,] # if just 1 med was ordered    
      } else {
        dd.3 = dd.2[rowSums(dd.2[,-1])==k,] # if >1 meds were ordered
      }
    p.meds.n[i] = 100*nrow(dd.3)/nrow(dd) # among patients who received k meds, prevalence of a specific combo
    n.pts[i] = nrow(dd.3) # number of patients with specific combo
    tab = table(dd.3$AKI) 
    n.AKI[i] = tab[2]     # number of AKI cases among patients with specific combo
    p.AKI[i] = 100*tab[2]/sum(tab)
  }
  
  n.x = c(n.x, n.pts)
  v.x = c(v.x, n.AKI)
  df = data.frame(n.meds=k, p.nmeds=100*nrow(dd)/nrow(training), meds, n.pts, n.AKI, 
                  p.meds.n, p.AKI)
  df$p.meds = df$p.nmeds * df$p.meds.n/100 # p.nmeds = Pr(n meds ordered), p.meds.n=Pr(meds combo | n meds ordered)
  
  write.table(df[order(df$p.AKI, decreasing=T),], file="Nephrotoxin_AKI.csv", 
              append=ifelse(k==1, FALSE, TRUE), 
              row.names=F, col.names=ifelse(k==1, TRUE, FALSE), quote=F, sep=",")
  # store all meds combination in one dataset
  dd.combo= rbind(dd.combo, df)
  
  ggplot(df, aes(meds, p.AKI, size=p.meds)) + 
    geom_point(colour="red") +
    coord_flip() +
    labs(y="P(AKI=1 | Nephrotoxin(s)=1)", x="Nephrotoxin") +
    theme(axis.text.y=element_text(size=rel(0.25))) +
    ggtitle(paste("When",k, "meds are given")) +
    ggsave(paste("AKI_vs_combo",k,".pdf",sep=""), dpi=600)
  
}
dd.combo=data.frame(dd.combo)

## scatter plot of prevalence and AKI risk
dd.n2 = dd.combo %>%
  filter(n.meds==2) %>%
  arrange(p.AKI, p.meds)
write.csv(dd.n2, file="AKI risk for 2 nephrotoxin combinations.csv", quote=F, row.names = FALSE)

head(dd.n2)
tail(dd.n2)
sum(dd.n2$p.meds) # among all orders, orders with 2 meds account for 27.56%

## Build a shiny app to dynamically track drug combo and AKI risk?
p=ggplot(dd.n2, aes(x=p.meds, y=p.AKI, text=meds)) +
  geom_point() +
  labs(x="Prevalence of meds combination", y="Prevalence of AKI events") +
  ggtitle("AKI risk vs. Meds combo")
ggplotly(p, tooltip="text") # from the viewer, use save as to same to webpage, build it into shiny??? 

p=ggplot(dd.n2, aes(x=p.meds, y=p.AKI, text=meds)) +
  geom_point() +
  geom_hline(yintercept=10) +
  lims(x=c(0, 1.5)) +
  labs(x="Prevalence of meds combination", y="Prevalence of AKI events") +
  ggtitle("AKI risk vs. Meds combo")
ggplotly(p, tooltip="text") # f

## interactive plot for fun
plot_ly(dd.n2, x = ~p.meds, y = ~p.AKI, text = ~meds, type = 'scatter', mode = 'markers', 
        marker = list(size = ~p.AKI/5, color = "Red",opacity = 0.5)) %>% 
  layout(title = 'AKI risk vs. 2-Meds Combinations', 
         xaxis = list(showgrid = FALSE, title = "Prevalence of meds combo"), 
         yaxis = list(showgrid = FALSE, title = "Prevalence of AKI events")) %>% 
  add_annotations(x = dd.n2$p.meds, y = dd.n2$p.AKI, text = dd.n2$meds, showarrow = FALSE)

```

```{r}
########################################################
## Find marginal probability of AKI for a give med 
## (ie, averaged over all combos containing that med)
########################################################

#dd.combo = read.csv("Nephrotoxin_AKI.csv", header=T)

# our new weights are based on estimated risk of AKI given the med
pAKI.m = NULL
for (m in vlist[-1]) {
  d = dd.combo %>% filter(str_detect(meds, m))
  if (dim(d)[1]==0) {
      p=0 
    } else {
      ### HERE IS THE MEAT OF THE ALGORITHM!!!
      p=sum(d$p.nmeds * d$p.meds * d$p.AKI)/sum(d$p.nmeds * d$p.meds) # Bayes formula!!! 
    }
  pAKI.m = c(pAKI.m, round(p, 0))  #round to integers
}

df = data.frame(meds=vlist[-1], weight=pAKI.m)
df[order(df$weight),]
w_new = df$weight[order(df$meds)]
  
ggplot(df, aes(meds, pAKI.m, size=pAKI.m)) + 
  geom_point(colour="red") +
  geom_hline(yintercept=25) +
  coord_flip() +
  labs(y="P(AKI | Nephrotoxin)", x="Nephrotoxin") +
  theme(axis.text.y=element_text(size=rel(0.75))) +
  ggtitle("Risk of AKI") +
  ggsave("AKI_vs_med_marginal_2013to2017.png", width=7, height=7, units="in")

##
## use the weighted sum to generate a risk score on testing sample!
##
yhat = as.matrix(testing[,-1])%*% (pAKI.m)  
myRoc = roc(testing$AKI, c(yhat), auc.polygon=T, grid=T)

# find optimal cutoffs: based on distance to the origin
cutoffs = seq(0, max(yhat), length.out = 1000)
out = NULL
for (i in 1:length(cutoffs)) {
  AKI.hat = ifelse(yhat>=cutoffs[i], 1, 0)  # Boolean function given cutoff
  sens = sum(AKI.hat[testing$AKI==1]==1)/sum(testing$AKI==1)
  spec = sum(AKI.hat[testing$AKI==0]==0)/sum(testing$AKI==0)
  d = sqrt((1-sens)^2+(1-spec)^2) 
  out = rbind(out, c(cutoffs[i], sens, spec, d))
}
out = as.data.frame(out)
names(out) = c("cutoff", "sens", "spec", "dist")
f.1 = out[order(out$dist),][1,]      # achieve 83% sensitivity and 75% specificity
f.1 

# or we can find the best sensitivity with specificity>85% to reduce false alarms
out.1 = subset(out, spec>=0.9)
f.2 = out.1[order(-out.1$sens),][1,]
f.2 
eAKI = (yhat>f.2$cutoff)
CrossTable(testing$AKI, eAKI)

# or we find max(sens+spec) while requiring sens>80% and spec>60%
# require higher spec to reduce false alarm fatigue
out.1 = subset(out, spec>=0.8 & sens>=0.70)
f.3 = out.1[order(-(out.1$spec + out.1$sens)),][1,]
f.3 
eAKI = (yhat>f.3$cutoff)
CrossTable(testing$AKI, eAKI)

# or we can use tiered cutoffs
nmeds.testing = apply(testing[,-1], 1, sum)
nmeds.testing[nmeds.testing>=4]=4 # 4 and 4+ into one group
f.4=NULL
for (i in 1:4) {
  testing.i = testing[nmeds.testing==i,]
  yhat.i = yhat[nmeds.testing==i]
  cutoffs = seq(0, max(yhat.i), length.out = 1000)
  
  out = NULL
  for (j in 1:length(cutoffs)) {
    AKI.hat = ifelse(yhat.i>cutoffs[j], 1, 0)  # Boolean function given cutoff
    sens = sum(AKI.hat[testing.i$AKI==1]==1)/sum(testing.i$AKI==1)
    spec = sum(AKI.hat[testing.i$AKI==0]==0)/sum(testing.i$AKI==0)
    d = sqrt((1-sens)^2+(1-spec)^2) 
    out = rbind(out, c(i, cutoffs[i], sens, spec, d))
  }
  out = as.data.frame(out)
  names(out) = c("nmeds", "cutoff", "sens", "spec", "dist")
  #out.1 = subset(out, spec>=0.85)
  #f.3 = rbind(f.3, out.1[order(-out.1$sens),][1,])  
  f.4 = rbind(f.4, out[order(out$dist),][1,])
}
f.4

# ROC on testing data
jpeg("AKI_ROC_marginal.jpeg", width=1024)
f = f.1
plot(myRoc, print.auc=T)
plot.roc(smooth(myRoc), add=T, col="blue")
points(f$spec, f$sens, pch=16, col="red", cex=1.5)
text(f$spec-0.3, f$sens+0.05, 
     paste("(cutoff=", round(f$cutoff,2), ", Spec=", round(f$spec, 2), 
           ", Sens=", round(f$sens,2), ")", sep=""), 
     cex=0.75)
dev.off()
```

```{r}
# compare the new weights and cutoffs on the entire data and compare to pre- and 
# post-NINJA alert algorithms
score_new = d_post%*%w_new
alert_new = ifelse(score_new>=14, 1, 0)
CrossTable(aki, alert_new, chisq=T)

alert_new = ifelse(score_new>=22, 1, 0)
CrossTable(aki, alert_new, chisq=T)

#pre alert system
CrossTable(aki, alert_pre, chisq=T)
binom.test(272, 580)
binom.test(10455, 14199)

# post
CrossTable(aki, alert_post, chisq=T)
binom.test(251, 580)
binom.test(12682, 14199)

# pre vs. post
prop.test(c(272, 251), c(580,580), correct=FALSE)
prop.test(c(10455, 12682),c(14199, 14199), correct = F) 

# post vs. new
prop.test(c(272, 347), c(580, 580), correct=F)
prop.test(c(12682, 12129), c(14199, 14199), correct=F)
```

```{r}
############################################################
## posterior predictions based on empirical Bayesian model
############################################################

M = training[indx,-1]  # states with at least one observation, indx is from the main code chunk
alpha = beta = 0.9     # beta prior p(theta.x), symmetric and concentrate prob mass at 0 and 1
theta.x = (v.x+alpha)/(alpha+beta+n.x) # posterior expected probability for each state x
p.truth.table = as.data.frame(cbind(M, n.x, v.x, theta.x))

write.table(p.truth.table, "probabilistic_truth_table_training.csv", 
            row.names=FALSE, quote=FALSE, sep=",")

n.meds = apply(testing[,-1], 1, sum)
yhat = rep(0, nrow(testing))  # noisy Boolean function
for (i in 1:nrow(testing)) {
  x = testing[i,-1]
  match.idx = row.match(x, M) # {prodlim}, find which state x that testing case is in
  # if no training data, we can still use number of meds as source of information!
  yhat[i] = ifelse(is.na(match.idx), theta.nmeds[n.meds[i]], theta.x[match.idx])
}

# ROC for the predicted probabilities
myRoc = roc(testing$AKI, c(yhat), auc.polygon=T, grid=T)  

# find optimal cutoffs
cutoffs = seq(0, 0.99, by=0.01)
out = NULL
for (i in 1:length(cutoffs)) {
  AKI.hat = ifelse(yhat>cutoffs[i], 1, 0)  # Boolean function given cutoff
  sens = sum(AKI.hat[testing$AKI==1]==1)/sum(testing$AKI==1)
  spec = sum(AKI.hat[testing$AKI==0]==0)/sum(testing$AKI==0)
  d = sqrt((1-sens)^2+(1-spec)^2)
  out = rbind(out, c(cutoffs[i], sens, spec, d))
}
out = as.data.frame(out)
names(out) = c("cutoff", "sens", "spec", "dist")

## optimal = min(sens+spec)
final = out[order(out$dist),][1,]      
# achieve 74% sensitivity and 70% specificity on validation sample
final 

# ROC on testing data
pdf("EB_AKI_ROC.pdf")
plot(myRoc, print.auc=T)
plot.roc(smooth(myRoc), add=T, col="blue")
points(final$spec, final$sens, pch=16, col="red", cex=1.5)
text(final$spec-0.2, final$sens+0.05, 
     paste("(cutoff=", round(final$cutoff,2), ", Spec=", round(final$spec,2), 
           ", Sens=", round(final$sens,2), ")", sep=""), 
     cex=0.75)
dev.off()

# or we can find the best sensitivity with specificity>85% to reduce false alarms
out.1 = subset(out, spec>=0.85)
f.2 = out.1[order(-out.1$sens),][1,]
f.2 

##
## Plot the results from Nephrotoxin_AKI.csv
##
dd = read.csv("Nephrotoxin_AKI.csv", header=T)

for (k in 1:5) {
  pdf(paste("AKI_meds_",k,".pdf",sep=""), width=11, height=18)
  ggplot(subset(dd, n.meds==k), aes(meds, p.AKI, size=p.meds)) + 
    geom_point(colour="red") +
    coord_flip() +
    labs(y=paste("P(AKI=1 | Nephrotoxin(s)=",k,")", sep=""), x="Nephrotoxin(s)") +
    theme(axis.text.y=element_text(size=rel(2/k))) 
    #ggtitle(paste("When", k, "Nephrotoxin are ordered"))
  dev.off()
}

####################################################################################
## End of main code chunk!
####################################################################################
```
